From 3889bf91b297ae2a7329565fed7845acedf443a1 Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Fri, 25 Mar 2005 20:03:52 +0000 Subject: [PATCH] bitkeeper revision 1.1236.43.15 (42446ea83i0TVEFNdNTE8D6WBPWfaQ) Move Linux 2.4 to writable pagetables. It doesn't boot, but that bug is not caused by this changeset (I see exactly the same behaviour with these changes backed out). Will need some investigation: first on 2.0-testing to see if any fixes are needed there... Signed-off-by: Keir Fraser --- .rootkeys | 3 - .../arch/xen/kernel/head.S | 5 +- linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c | 1 + .../arch/xen/kernel/traps.c | 1 + linux-2.4.29-xen-sparse/arch/xen/mm/fault.c | 1 - linux-2.4.29-xen-sparse/arch/xen/mm/init.c | 13 +- linux-2.4.29-xen-sparse/fs/exec.c | 1179 --------------- .../include/asm-xen/page.h | 17 +- .../include/asm-xen/pgalloc.h | 19 +- .../include/asm-xen/pgtable-2level.h | 34 +- .../include/asm-xen/pgtable.h | 45 +- linux-2.4.29-xen-sparse/mm/highmem.c | 1 - linux-2.4.29-xen-sparse/mm/memory.c | 28 +- linux-2.4.29-xen-sparse/mm/mremap.c | 2 - linux-2.4.29-xen-sparse/mm/swapfile.c | 1267 ----------------- linux-2.4.29-xen-sparse/mm/vmalloc.c | 385 ----- linux-2.6.11-xen-sparse/arch/xen/Kconfig | 4 - .../arch/xen/configs/xen0_defconfig | 1 - .../arch/xen/configs/xenU_defconfig | 1 - .../arch/xen/i386/kernel/traps.c | 2 +- .../arch/xen/i386/mm/fault.c | 1 - .../arch/xen/i386/mm/hypervisor.c | 9 +- .../arch/xen/i386/mm/pgtable.c | 2 +- .../arch/xen/kernel/reboot.c | 2 - .../include/asm-xen/asm-i386/page.h | 2 +- .../include/asm-xen/asm-i386/pgtable.h | 2 - .../include/asm-xen/hypervisor.h | 2 - 27 files changed, 73 insertions(+), 2956 deletions(-) delete mode 100644 linux-2.4.29-xen-sparse/fs/exec.c delete mode 100644 linux-2.4.29-xen-sparse/mm/swapfile.c delete mode 100644 linux-2.4.29-xen-sparse/mm/vmalloc.c diff --git a/.rootkeys b/.rootkeys index ad520bef1b..8dca99fafd 100644 --- a/.rootkeys +++ b/.rootkeys @@ -167,7 +167,6 @@ 
3e5a4e66rw65CxyolW9PKz4GG42RcA linux-2.4.29-xen-sparse/drivers/char/tty_io.c 40c9c0c1pPwYE3-4i-oI3ubUu7UgvQ linux-2.4.29-xen-sparse/drivers/scsi/aic7xxx/Makefile 41f97f64nW0wmgLxhwzPTzkF4E5ERA linux-2.4.29-xen-sparse/drivers/usb/hcd.c -3e5a4e669uzIE54VwucPYtGwXLAbzA linux-2.4.29-xen-sparse/fs/exec.c 3e5a4e66wbeCpsJgVf_U8Jde-CNcsA linux-2.4.29-xen-sparse/include/asm-xen/bugs.h 3e5a4e66HdSkvIV6SJ1evG_xmTmXHA linux-2.4.29-xen-sparse/include/asm-xen/desc.h 3e5a4e66SYp_UpAVcF8Lc1wa3Qtgzw linux-2.4.29-xen-sparse/include/asm-xen/fixmap.h @@ -205,8 +204,6 @@ 3f108af5VxPkLv13tXpXgoRKALQtXQ linux-2.4.29-xen-sparse/mm/mprotect.c 3e5a4e681xMPdF9xCMwpyfuYMySU5g linux-2.4.29-xen-sparse/mm/mremap.c 409ba2e7akOFqQUg6Qyg2s28xcXiMg linux-2.4.29-xen-sparse/mm/page_alloc.c -3e5a4e683HKVU-sxtagrDasRB8eBVw linux-2.4.29-xen-sparse/mm/swapfile.c -41180721bNns9Na7w1nJ0ZVt8bhUNA linux-2.4.29-xen-sparse/mm/vmalloc.c 41505c57WAd5l1rlfCLNSCpx9J13vA linux-2.4.29-xen-sparse/net/core/skbuff.c 40f562372u3A7_kfbYYixPHJJxYUxA linux-2.6.11-xen-sparse/arch/xen/Kconfig 40f56237utH41NPukqHksuNf29IC9A linux-2.6.11-xen-sparse/arch/xen/Kconfig.drivers diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S index cda41ae56c..c856a0bd29 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/head.S @@ -1,6 +1,9 @@ .section __xen_guest - .asciz "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000" + .ascii "GUEST_OS=linux,GUEST_VER=2.4,XEN_VER=3.0,VIRT_BASE=0xC0000000" + .ascii ",LOADER=generic" + .ascii ",PT_MODE_WRITABLE" + .byte 0 .text #include diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c index 374c9b6c30..61fc1eb824 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/ldt.c @@ -84,6 +84,7 @@ static inline int copy_ldt(mm_context_t *new, mm_context_t *old) } memcpy(new->ldt, 
old->ldt, old->size*LDT_ENTRY_SIZE); make_pages_readonly(new->ldt, (new->size*LDT_ENTRY_SIZE)/PAGE_SIZE); + flush_page_update_queue(); return 0; } diff --git a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c index ada06dd973..f593714e02 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c +++ b/linux-2.4.29-xen-sparse/arch/xen/kernel/traps.c @@ -623,6 +623,7 @@ void __init trap_init(void) set_call_gate(&default_ldt[0],lcall7); set_call_gate(&default_ldt[4],lcall27); __make_page_readonly(&default_ldt[0]); + flush_page_update_queue(); cpu_init(); } diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c index d19218fe32..49a0afc887 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/fault.c @@ -296,7 +296,6 @@ vmalloc_fault: if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); - XEN_flush_page_update_queue(); /* flush PMD update */ pte_k = pte_offset(pmd_k, address); if (!pte_present(*pte_k)) diff --git a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c index 40a5af9273..88d775bcd4 100644 --- a/linux-2.4.29-xen-sparse/arch/xen/mm/init.c +++ b/linux-2.4.29-xen-sparse/arch/xen/mm/init.c @@ -142,7 +142,7 @@ static inline void set_pte_phys (unsigned long vaddr, } pte = pte_offset(pmd, vaddr); - queue_l1_entry_update(pte, phys | pgprot_val(prot)); + set_pte(pte, (pte_t) { phys | pgprot_val(prot) }); /* * It's enough to flush this one mapping. 
@@ -201,17 +201,13 @@ static void __init fixrange_init (unsigned long start, kpgd = pgd_offset_k((unsigned long)pte); kpmd = pmd_offset(kpgd, (unsigned long)pte); kpte = pte_offset(kpmd, (unsigned long)pte); - queue_l1_entry_update(kpte, - (*(unsigned long *)kpte)&~_PAGE_RW); - + set_pte(kpte, pte_wrprotect(*kpte)); set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte))); } vaddr += PMD_SIZE; } j = 0; } - - XEN_flush_page_update_queue(); } @@ -257,10 +253,8 @@ static void __init pagetable_init (void) kpgd = pgd_offset_k((unsigned long)pte_base); kpmd = pmd_offset(kpgd, (unsigned long)pte_base); kpte = pte_offset(kpmd, (unsigned long)pte_base); - queue_l1_entry_update(kpte, - (*(unsigned long *)kpte)&~_PAGE_RW); + set_pte(kpte, pte_wrprotect(*kpte)); set_pmd(pmd, __pmd(_KERNPG_TABLE + __pa(pte_base))); - XEN_flush_page_update_queue(); } } @@ -311,6 +305,7 @@ void __init paging_init(void) pagetable_init(); zone_sizes_init(); + /* Switch to the real shared_info page, and clear the dummy page. */ set_fixmap(FIX_SHARED_INFO, xen_start_info.shared_info); HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); diff --git a/linux-2.4.29-xen-sparse/fs/exec.c b/linux-2.4.29-xen-sparse/fs/exec.c deleted file mode 100644 index 8a114151a9..0000000000 --- a/linux-2.4.29-xen-sparse/fs/exec.c +++ /dev/null @@ -1,1179 +0,0 @@ -/* - * linux/fs/exec.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -/* - * #!-checking implemented by tytso. - */ -/* - * Demand-loading implemented 01.12.91 - no need to read anything but - * the header into memory. The inode of the executable is put into - * "current->executable", and page faults do the actual loading. Clean. - * - * Once more I can proudly say that linux stood up to being changed: it - * was less than 2 hours work to get demand-loading completely implemented. - * - * Demand loading changed July 1993 by Eric Youngdale. Use mmap instead, - * current->executable is only used by the procfs. 
This allows a dispatch - * table to check for several different types of binary formats. We keep - * trying until we recognize the file or we run out of supported binary - * formats. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#define __NO_VERSION__ -#include - -#include -#include -#include - -#ifdef CONFIG_KMOD -#include -#endif - -int core_uses_pid; -char core_pattern[65] = "core"; -int core_setuid_ok = 0; -/* The maximal length of core_pattern is also specified in sysctl.c */ - -static struct linux_binfmt *formats; -static rwlock_t binfmt_lock = RW_LOCK_UNLOCKED; - -int register_binfmt(struct linux_binfmt * fmt) -{ - struct linux_binfmt ** tmp = &formats; - - if (!fmt) - return -EINVAL; - if (fmt->next) - return -EBUSY; - write_lock(&binfmt_lock); - while (*tmp) { - if (fmt == *tmp) { - write_unlock(&binfmt_lock); - return -EBUSY; - } - tmp = &(*tmp)->next; - } - fmt->next = formats; - formats = fmt; - write_unlock(&binfmt_lock); - return 0; -} - -int unregister_binfmt(struct linux_binfmt * fmt) -{ - struct linux_binfmt ** tmp = &formats; - - write_lock(&binfmt_lock); - while (*tmp) { - if (fmt == *tmp) { - *tmp = fmt->next; - write_unlock(&binfmt_lock); - return 0; - } - tmp = &(*tmp)->next; - } - write_unlock(&binfmt_lock); - return -EINVAL; -} - -static inline void put_binfmt(struct linux_binfmt * fmt) -{ - if (fmt->module) - __MOD_DEC_USE_COUNT(fmt->module); -} - -/* - * Note that a shared library must be both readable and executable due to - * security reasons. - * - * Also note that we take the address to load from from the file itself. 
- */ -asmlinkage long sys_uselib(const char * library) -{ - struct file * file; - struct nameidata nd; - int error; - - error = user_path_walk(library, &nd); - if (error) - goto out; - - error = -EINVAL; - if (!S_ISREG(nd.dentry->d_inode->i_mode)) - goto exit; - - error = permission(nd.dentry->d_inode, MAY_READ | MAY_EXEC); - if (error) - goto exit; - - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - - error = -ENOEXEC; - if(file->f_op && file->f_op->read) { - struct linux_binfmt * fmt; - - read_lock(&binfmt_lock); - for (fmt = formats ; fmt ; fmt = fmt->next) { - if (!fmt->load_shlib) - continue; - if (!try_inc_mod_count(fmt->module)) - continue; - read_unlock(&binfmt_lock); - error = fmt->load_shlib(file); - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (error != -ENOEXEC) - break; - } - read_unlock(&binfmt_lock); - } - fput(file); -out: - return error; -exit: - path_release(&nd); - goto out; -} - -/* - * count() counts the number of arguments/envelopes - */ -static int count(char ** argv, int max) -{ - int i = 0; - - if (argv != NULL) { - for (;;) { - char * p; - - if (get_user(p, argv)) - return -EFAULT; - if (!p) - break; - argv++; - if(++i > max) - return -E2BIG; - } - } - return i; -} - -/* - * 'copy_strings()' copies argument/envelope strings from user - * memory to free pages in kernel mem. These are in a format ready - * to be put directly into the top of new user memory. - */ -int copy_strings(int argc,char ** argv, struct linux_binprm *bprm) -{ - struct page *kmapped_page = NULL; - char *kaddr = NULL; - int ret; - - while (argc-- > 0) { - char *str; - int len; - unsigned long pos; - - if (get_user(str, argv+argc) || - !(len = strnlen_user(str, bprm->p))) { - ret = -EFAULT; - goto out; - } - - if (bprm->p < len) { - ret = -E2BIG; - goto out; - } - - bprm->p -= len; - /* XXX: add architecture specific overflow check here. 
*/ - pos = bprm->p; - - while (len > 0) { - int i, new, err; - int offset, bytes_to_copy; - struct page *page; - - offset = pos % PAGE_SIZE; - i = pos/PAGE_SIZE; - page = bprm->page[i]; - new = 0; - if (!page) { - page = alloc_page(GFP_HIGHUSER); - bprm->page[i] = page; - if (!page) { - ret = -ENOMEM; - goto out; - } - new = 1; - } - - if (page != kmapped_page) { - if (kmapped_page) - kunmap(kmapped_page); - kmapped_page = page; - kaddr = kmap(kmapped_page); - } - if (new && offset) - memset(kaddr, 0, offset); - bytes_to_copy = PAGE_SIZE - offset; - if (bytes_to_copy > len) { - bytes_to_copy = len; - if (new) - memset(kaddr+offset+len, 0, - PAGE_SIZE-offset-len); - } - err = copy_from_user(kaddr+offset, str, bytes_to_copy); - if (err) { - ret = -EFAULT; - goto out; - } - - pos += bytes_to_copy; - str += bytes_to_copy; - len -= bytes_to_copy; - } - } - ret = 0; -out: - if (kmapped_page) - kunmap(kmapped_page); - return ret; -} - -/* - * Like copy_strings, but get argv and its values from kernel memory. - */ -int copy_strings_kernel(int argc,char ** argv, struct linux_binprm *bprm) -{ - int r; - mm_segment_t oldfs = get_fs(); - set_fs(KERNEL_DS); - r = copy_strings(argc, argv, bprm); - set_fs(oldfs); - return r; -} - -/* - * This routine is used to map in a page into an address space: needed by - * execve() for the initial stack and environment pages. - * - * tsk->mmap_sem is held for writing. 
- */ -void put_dirty_page(struct task_struct * tsk, struct page *page, unsigned long address) -{ - pgd_t * pgd; - pmd_t * pmd; - pte_t * pte; - struct vm_area_struct *vma; - pgprot_t prot = PAGE_COPY; - - if (page_count(page) != 1) - printk(KERN_ERR "mem_map disagrees with %p at %08lx\n", page, address); - pgd = pgd_offset(tsk->mm, address); - - spin_lock(&tsk->mm->page_table_lock); - pmd = pmd_alloc(tsk->mm, pgd, address); - if (!pmd) - goto out; - pte = pte_alloc(tsk->mm, pmd, address); - if (!pte) - goto out; - if (!pte_none(*pte)) - goto out; - lru_cache_add(page); - flush_dcache_page(page); - flush_page_to_ram(page); - /* lookup is cheap because there is only a single entry in the list */ - vma = find_vma(tsk->mm, address); - if (vma) - prot = vma->vm_page_prot; - set_pte(pte, pte_mkdirty(pte_mkwrite(mk_pte(page, prot)))); - XEN_flush_page_update_queue(); - tsk->mm->rss++; - spin_unlock(&tsk->mm->page_table_lock); - - /* no need for flush_tlb */ - return; -out: - spin_unlock(&tsk->mm->page_table_lock); - __free_page(page); - force_sig(SIGKILL, tsk); - return; -} - -int setup_arg_pages(struct linux_binprm *bprm) -{ - unsigned long stack_base; - struct vm_area_struct *mpnt; - int i, ret; - - stack_base = STACK_TOP - MAX_ARG_PAGES*PAGE_SIZE; - - bprm->p += stack_base; - if (bprm->loader) - bprm->loader += stack_base; - bprm->exec += stack_base; - - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); - if (!mpnt) - return -ENOMEM; - - down_write(¤t->mm->mmap_sem); - { - mpnt->vm_mm = current->mm; - mpnt->vm_start = PAGE_MASK & (unsigned long) bprm->p; - mpnt->vm_end = STACK_TOP; - mpnt->vm_flags = VM_STACK_FLAGS; - mpnt->vm_page_prot = protection_map[VM_STACK_FLAGS & 0x7]; - mpnt->vm_ops = NULL; - mpnt->vm_pgoff = 0; - mpnt->vm_file = NULL; - mpnt->vm_private_data = (void *) 0; - if ((ret = insert_vm_struct(current->mm, mpnt))) { - up_write(¤t->mm->mmap_sem); - kmem_cache_free(vm_area_cachep, mpnt); - return ret; - } - current->mm->total_vm = (mpnt->vm_end - 
mpnt->vm_start) >> PAGE_SHIFT; - } - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page *page = bprm->page[i]; - if (page) { - bprm->page[i] = NULL; - put_dirty_page(current,page,stack_base); - } - stack_base += PAGE_SIZE; - } - up_write(¤t->mm->mmap_sem); - - return 0; -} - -struct file *open_exec(const char *name) -{ - struct nameidata nd; - struct inode *inode; - struct file *file; - int err = 0; - - err = path_lookup(name, LOOKUP_FOLLOW|LOOKUP_POSITIVE, &nd); - file = ERR_PTR(err); - if (!err) { - inode = nd.dentry->d_inode; - file = ERR_PTR(-EACCES); - if (!(nd.mnt->mnt_flags & MNT_NOEXEC) && - S_ISREG(inode->i_mode)) { - int err = permission(inode, MAY_EXEC); - if (!err && !(inode->i_mode & 0111)) - err = -EACCES; - file = ERR_PTR(err); - if (!err) { - file = dentry_open(nd.dentry, nd.mnt, O_RDONLY); - if (!IS_ERR(file)) { - err = deny_write_access(file); - if (err) { - fput(file); - file = ERR_PTR(err); - } - } -out: - return file; - } - } - path_release(&nd); - } - goto out; -} - -int kernel_read(struct file *file, unsigned long offset, - char * addr, unsigned long count) -{ - mm_segment_t old_fs; - loff_t pos = offset; - int result = -ENOSYS; - - if (!file->f_op->read) - goto fail; - old_fs = get_fs(); - set_fs(get_ds()); - result = file->f_op->read(file, addr, count, &pos); - set_fs(old_fs); -fail: - return result; -} - -static int exec_mmap(void) -{ - struct mm_struct * mm, * old_mm; - - old_mm = current->mm; - - if (old_mm && atomic_read(&old_mm->mm_users) == 1) { - mm_release(); - down_write(&old_mm->mmap_sem); - exit_mmap(old_mm); - up_write(&old_mm->mmap_sem); - return 0; - } - - - mm = mm_alloc(); - if (mm) { - struct mm_struct *active_mm; - - if (init_new_context(current, mm)) { - mmdrop(mm); - return -ENOMEM; - } - - /* Add it to the list of mm's */ - spin_lock(&mmlist_lock); - list_add(&mm->mmlist, &init_mm.mmlist); - mmlist_nr++; - spin_unlock(&mmlist_lock); - - task_lock(current); - active_mm = current->active_mm; - current->mm = mm; - 
current->active_mm = mm; - task_unlock(current); - activate_mm(active_mm, mm); - mm_release(); - if (old_mm) { - if (active_mm != old_mm) BUG(); - mmput(old_mm); - return 0; - } - mmdrop(active_mm); - return 0; - } - return -ENOMEM; -} - -/* - * This function makes sure the current process has its own signal table, - * so that flush_signal_handlers can later reset the handlers without - * disturbing other processes. (Other processes might share the signal - * table via the CLONE_SIGNAL option to clone().) - */ - -static inline int make_private_signals(void) -{ - struct signal_struct * newsig; - - if (atomic_read(¤t->sig->count) <= 1) - return 0; - newsig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); - if (newsig == NULL) - return -ENOMEM; - spin_lock_init(&newsig->siglock); - atomic_set(&newsig->count, 1); - memcpy(newsig->action, current->sig->action, sizeof(newsig->action)); - spin_lock_irq(¤t->sigmask_lock); - current->sig = newsig; - spin_unlock_irq(¤t->sigmask_lock); - return 0; -} - -/* - * If make_private_signals() made a copy of the signal table, decrement the - * refcount of the original table, and free it if necessary. - * We don't do that in make_private_signals() so that we can back off - * in flush_old_exec() if an error occurs after calling make_private_signals(). 
- */ - -static inline void release_old_signals(struct signal_struct * oldsig) -{ - if (current->sig == oldsig) - return; - if (atomic_dec_and_test(&oldsig->count)) - kmem_cache_free(sigact_cachep, oldsig); -} - -/* - * These functions flushes out all traces of the currently running executable - * so that a new one can be started - */ - -static inline void flush_old_files(struct files_struct * files) -{ - long j = -1; - - write_lock(&files->file_lock); - for (;;) { - unsigned long set, i; - - j++; - i = j * __NFDBITS; - if (i >= files->max_fds || i >= files->max_fdset) - break; - set = files->close_on_exec->fds_bits[j]; - if (!set) - continue; - files->close_on_exec->fds_bits[j] = 0; - write_unlock(&files->file_lock); - for ( ; set ; i++,set >>= 1) { - if (set & 1) { - sys_close(i); - } - } - write_lock(&files->file_lock); - - } - write_unlock(&files->file_lock); -} - -/* - * An execve() will automatically "de-thread" the process. - * Note: we don't have to hold the tasklist_lock to test - * whether we migth need to do this. If we're not part of - * a thread group, there is no way we can become one - * dynamically. And if we are, we only need to protect the - * unlink - even if we race with the last other thread exit, - * at worst the list_del_init() might end up being a no-op. - */ -static inline void de_thread(struct task_struct *tsk) -{ - if (!list_empty(&tsk->thread_group)) { - write_lock_irq(&tasklist_lock); - list_del_init(&tsk->thread_group); - write_unlock_irq(&tasklist_lock); - } - - /* Minor oddity: this might stay the same. 
*/ - tsk->tgid = tsk->pid; -} - -void get_task_comm(char *buf, struct task_struct *tsk) -{ - /* buf must be at least sizeof(tsk->comm) in size */ - task_lock(tsk); - memcpy(buf, tsk->comm, sizeof(tsk->comm)); - task_unlock(tsk); -} - -void set_task_comm(struct task_struct *tsk, char *buf) -{ - task_lock(tsk); - strncpy(tsk->comm, buf, sizeof(tsk->comm)); - tsk->comm[sizeof(tsk->comm)-1]='\0'; - task_unlock(tsk); -} - -int flush_old_exec(struct linux_binprm * bprm) -{ - char * name; - int i, ch, retval; - struct signal_struct * oldsig; - struct files_struct * files; - char tcomm[sizeof(current->comm)]; - - /* - * Make sure we have a private signal table - */ - oldsig = current->sig; - retval = make_private_signals(); - if (retval) goto flush_failed; - - /* - * Make sure we have private file handles. Ask the - * fork helper to do the work for us and the exit - * helper to do the cleanup of the old one. - */ - - files = current->files; /* refcounted so safe to hold */ - retval = unshare_files(); - if(retval) - goto flush_failed; - - /* - * Release all of the old mmap stuff - */ - retval = exec_mmap(); - if (retval) goto mmap_failed; - - /* This is the point of no return */ - steal_locks(files); - put_files_struct(files); - release_old_signals(oldsig); - - current->sas_ss_sp = current->sas_ss_size = 0; - - if (current->euid == current->uid && current->egid == current->gid) { - current->mm->dumpable = 1; - current->task_dumpable = 1; - } - name = bprm->filename; - for (i=0; (ch = *(name++)) != '\0';) { - if (ch == '/') - i = 0; - else - if (i < (sizeof(tcomm) - 1)) - tcomm[i++] = ch; - } - tcomm[i] = '\0'; - set_task_comm(current, tcomm); - - flush_thread(); - - de_thread(current); - - if (bprm->e_uid != current->euid || bprm->e_gid != current->egid || - permission(bprm->file->f_dentry->d_inode,MAY_READ)) - current->mm->dumpable = 0; - - /* An exec changes our domain. 
We are no longer part of the thread - group */ - - current->self_exec_id++; - - flush_signal_handlers(current); - flush_old_files(current->files); - - return 0; - -mmap_failed: - put_files_struct(current->files); - current->files = files; -flush_failed: - spin_lock_irq(¤t->sigmask_lock); - if (current->sig != oldsig) { - kmem_cache_free(sigact_cachep, current->sig); - current->sig = oldsig; - } - spin_unlock_irq(¤t->sigmask_lock); - return retval; -} - -/* - * We mustn't allow tracing of suid binaries, unless - * the tracer has the capability to trace anything.. - */ -static inline int must_not_trace_exec(struct task_struct * p) -{ - return (p->ptrace & PT_PTRACED) && !(p->ptrace & PT_PTRACE_CAP); -} - -/* - * Fill the binprm structure from the inode. - * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes - */ -int prepare_binprm(struct linux_binprm *bprm) -{ - int mode; - struct inode * inode = bprm->file->f_dentry->d_inode; - - mode = inode->i_mode; - /* - * Check execute perms again - if the caller has CAP_DAC_OVERRIDE, - * vfs_permission lets a non-executable through - */ - if (!(mode & 0111)) /* with at least _one_ execute bit set */ - return -EACCES; - if (bprm->file->f_op == NULL) - return -EACCES; - - bprm->e_uid = current->euid; - bprm->e_gid = current->egid; - - if(!(bprm->file->f_vfsmnt->mnt_flags & MNT_NOSUID)) { - /* Set-uid? */ - if (mode & S_ISUID) - bprm->e_uid = inode->i_uid; - - /* Set-gid? */ - /* - * If setgid is set but no group execute bit then this - * is a candidate for mandatory locking, not a setgid - * executable. 
- */ - if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) - bprm->e_gid = inode->i_gid; - } - - /* We don't have VFS support for capabilities yet */ - cap_clear(bprm->cap_inheritable); - cap_clear(bprm->cap_permitted); - cap_clear(bprm->cap_effective); - - /* To support inheritance of root-permissions and suid-root - * executables under compatibility mode, we raise all three - * capability sets for the file. - * - * If only the real uid is 0, we only raise the inheritable - * and permitted sets of the executable file. - */ - - if (!issecure(SECURE_NOROOT)) { - if (bprm->e_uid == 0 || current->uid == 0) { - cap_set_full(bprm->cap_inheritable); - cap_set_full(bprm->cap_permitted); - } - if (bprm->e_uid == 0) - cap_set_full(bprm->cap_effective); - } - - memset(bprm->buf,0,BINPRM_BUF_SIZE); - return kernel_read(bprm->file,0,bprm->buf,BINPRM_BUF_SIZE); -} - -/* - * This function is used to produce the new IDs and capabilities - * from the old ones and the file's capabilities. - * - * The formula used for evolving capabilities is: - * - * pI' = pI - * (***) pP' = (fP & X) | (fI & pI) - * pE' = pP' & fE [NB. fE is 0 or ~0] - * - * I=Inheritable, P=Permitted, E=Effective // p=process, f=file - * ' indicates post-exec(), and X is the global 'cap_bset'. 
- * - */ - -void compute_creds(struct linux_binprm *bprm) -{ - kernel_cap_t new_permitted, working; - int do_unlock = 0; - - new_permitted = cap_intersect(bprm->cap_permitted, cap_bset); - working = cap_intersect(bprm->cap_inheritable, - current->cap_inheritable); - new_permitted = cap_combine(new_permitted, working); - - if (bprm->e_uid != current->uid || bprm->e_gid != current->gid || - !cap_issubset(new_permitted, current->cap_permitted)) { - current->mm->dumpable = 0; - - lock_kernel(); - if (must_not_trace_exec(current) - || atomic_read(¤t->fs->count) > 1 - || atomic_read(¤t->files->count) > 1 - || atomic_read(¤t->sig->count) > 1) { - if(!capable(CAP_SETUID)) { - bprm->e_uid = current->uid; - bprm->e_gid = current->gid; - } - if(!capable(CAP_SETPCAP)) { - new_permitted = cap_intersect(new_permitted, - current->cap_permitted); - } - } - do_unlock = 1; - } - - - /* For init, we want to retain the capabilities set - * in the init_task struct. Thus we skip the usual - * capability rules */ - if (current->pid != 1) { - current->cap_permitted = new_permitted; - current->cap_effective = - cap_intersect(new_permitted, bprm->cap_effective); - } - - /* AUD: Audit candidate if current->cap_effective is set */ - - current->suid = current->euid = current->fsuid = bprm->e_uid; - current->sgid = current->egid = current->fsgid = bprm->e_gid; - - if(do_unlock) - unlock_kernel(); - current->keep_capabilities = 0; -} - - -void remove_arg_zero(struct linux_binprm *bprm) -{ - if (bprm->argc) { - unsigned long offset; - char * kaddr; - struct page *page; - - offset = bprm->p % PAGE_SIZE; - goto inside; - - while (bprm->p++, *(kaddr+offset++)) { - if (offset != PAGE_SIZE) - continue; - offset = 0; - kunmap(page); -inside: - page = bprm->page[bprm->p/PAGE_SIZE]; - kaddr = kmap(page); - } - kunmap(page); - bprm->argc--; - } -} - -/* - * cycle the list of binary formats handler, until one recognizes the image - */ -int search_binary_handler(struct linux_binprm *bprm,struct pt_regs 
*regs) -{ - int try,retval=0; - struct linux_binfmt *fmt; -#ifdef __alpha__ - /* handle /sbin/loader.. */ - { - struct exec * eh = (struct exec *) bprm->buf; - - if (!bprm->loader && eh->fh.f_magic == 0x183 && - (eh->fh.f_flags & 0x3000) == 0x3000) - { - struct file * file; - unsigned long loader; - - allow_write_access(bprm->file); - fput(bprm->file); - bprm->file = NULL; - - loader = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - - file = open_exec("/sbin/loader"); - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - /* Remember if the application is TASO. */ - bprm->sh_bang = eh->ah.entry < 0x100000000; - - bprm->file = file; - bprm->loader = loader; - retval = prepare_binprm(bprm); - if (retval<0) - return retval; - /* should call search_binary_handler recursively here, - but it does not matter */ - } - } -#endif - /* kernel module loader fixup */ - /* so we don't try to load run modprobe in kernel space. */ - set_fs(USER_DS); - for (try=0; try<2; try++) { - read_lock(&binfmt_lock); - for (fmt = formats ; fmt ; fmt = fmt->next) { - int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; - if (!fn) - continue; - if (!try_inc_mod_count(fmt->module)) - continue; - read_unlock(&binfmt_lock); - retval = fn(bprm, regs); - if (retval >= 0) { - put_binfmt(fmt); - allow_write_access(bprm->file); - if (bprm->file) - fput(bprm->file); - bprm->file = NULL; - current->did_exec = 1; - return retval; - } - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (retval != -ENOEXEC) - break; - if (!bprm->file) { - read_unlock(&binfmt_lock); - return retval; - } - } - read_unlock(&binfmt_lock); - if (retval != -ENOEXEC) { - break; -#ifdef CONFIG_KMOD - }else{ -#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) - char modname[20]; - if (printable(bprm->buf[0]) && - printable(bprm->buf[1]) && - printable(bprm->buf[2]) && - printable(bprm->buf[3])) - break; /* -ENOEXEC */ - sprintf(modname, "binfmt-%04x", *(unsigned short 
*)(&bprm->buf[2])); - request_module(modname); -#endif - } - } - return retval; -} - - -/* - * sys_execve() executes a new program. - */ -int do_execve(char * filename, char ** argv, char ** envp, struct pt_regs * regs) -{ - struct linux_binprm bprm; - struct file *file; - int retval; - int i; - - file = open_exec(filename); - - retval = PTR_ERR(file); - if (IS_ERR(file)) - return retval; - - bprm.p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); - memset(bprm.page, 0, MAX_ARG_PAGES*sizeof(bprm.page[0])); - - bprm.file = file; - bprm.filename = filename; - bprm.sh_bang = 0; - bprm.loader = 0; - bprm.exec = 0; - if ((bprm.argc = count(argv, bprm.p / sizeof(void *))) < 0) { - allow_write_access(file); - fput(file); - return bprm.argc; - } - - if ((bprm.envc = count(envp, bprm.p / sizeof(void *))) < 0) { - allow_write_access(file); - fput(file); - return bprm.envc; - } - - retval = prepare_binprm(&bprm); - if (retval < 0) - goto out; - - retval = copy_strings_kernel(1, &bprm.filename, &bprm); - if (retval < 0) - goto out; - - bprm.exec = bprm.p; - retval = copy_strings(bprm.envc, envp, &bprm); - if (retval < 0) - goto out; - - retval = copy_strings(bprm.argc, argv, &bprm); - if (retval < 0) - goto out; - - retval = search_binary_handler(&bprm,regs); - if (retval >= 0) - /* execve success */ - return retval; - -out: - /* Something went wrong, return the inode and free the argument pages*/ - allow_write_access(bprm.file); - if (bprm.file) - fput(bprm.file); - - for (i = 0 ; i < MAX_ARG_PAGES ; i++) { - struct page * page = bprm.page[i]; - if (page) - __free_page(page); - } - - return retval; -} - -void set_binfmt(struct linux_binfmt *new) -{ - struct linux_binfmt *old = current->binfmt; - if (new && new->module) - __MOD_INC_USE_COUNT(new->module); - current->binfmt = new; - if (old && old->module) - __MOD_DEC_USE_COUNT(old->module); -} - -#define CORENAME_MAX_SIZE 64 - -/* format_corename will inspect the pattern parameter, and output a - * name into corename, which must have 
space for at least - * CORENAME_MAX_SIZE bytes plus one byte for the zero terminator. - */ -void format_corename(char *corename, const char *pattern, long signr) -{ - const char *pat_ptr = pattern; - char *out_ptr = corename; - char *const out_end = corename + CORENAME_MAX_SIZE; - int rc; - int pid_in_pattern = 0; - - /* Repeat as long as we have more pattern to process and more output - space */ - while (*pat_ptr) { - if (*pat_ptr != '%') { - if (out_ptr == out_end) - goto out; - *out_ptr++ = *pat_ptr++; - } else { - switch (*++pat_ptr) { - case 0: - goto out; - /* Double percent, output one percent */ - case '%': - if (out_ptr == out_end) - goto out; - *out_ptr++ = '%'; - break; - /* pid */ - case 'p': - pid_in_pattern = 1; - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->pid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* uid */ - case 'u': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->uid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* gid */ - case 'g': - rc = snprintf(out_ptr, out_end - out_ptr, - "%d", current->gid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* signal that caused the coredump */ - case 's': - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", signr); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* UNIX time of coredump */ - case 't': { - struct timeval tv; - do_gettimeofday(&tv); - rc = snprintf(out_ptr, out_end - out_ptr, - "%ld", tv.tv_sec); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - } - /* hostname */ - case 'h': - down_read(&uts_sem); - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", system_utsname.nodename); - up_read(&uts_sem); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - /* executable */ - case 'e': - rc = snprintf(out_ptr, out_end - out_ptr, - "%s", current->comm); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - break; - default: - 
break; - } - ++pat_ptr; - } - } - /* Backward compatibility with core_uses_pid: - * - * If core_pattern does not include a %p (as is the default) - * and core_uses_pid is set, then .%pid will be appended to - * the filename */ - if (!pid_in_pattern - && (core_uses_pid || atomic_read(&current->mm->mm_users) != 1)) { - rc = snprintf(out_ptr, out_end - out_ptr, - ".%d", current->pid); - if (rc > out_end - out_ptr) - goto out; - out_ptr += rc; - } - out: - *out_ptr = 0; -} - -int do_coredump(long signr, struct pt_regs * regs) -{ - struct linux_binfmt * binfmt; - char corename[CORENAME_MAX_SIZE + 1]; - struct file * file; - struct inode * inode; - int retval = 0; - int fsuid = current->fsuid; - - lock_kernel(); - binfmt = current->binfmt; - if (!binfmt || !binfmt->core_dump) - goto fail; - if (!is_dumpable(current)) - { - if(!core_setuid_ok || !current->task_dumpable) - goto fail; - current->fsuid = 0; - } - current->mm->dumpable = 0; - if (current->rlim[RLIMIT_CORE].rlim_cur < binfmt->min_coredump) - goto fail; - - format_corename(corename, core_pattern, signr); - file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW, 0600); - if (IS_ERR(file)) - goto fail; - inode = file->f_dentry->d_inode; - if (inode->i_nlink > 1) - goto close_fail; /* multiple links - don't dump */ - if (d_unhashed(file->f_dentry)) - goto close_fail; - - if (!S_ISREG(inode->i_mode)) - goto close_fail; - if (!file->f_op) - goto close_fail; - if (!file->f_op->write) - goto close_fail; - if (do_truncate(file->f_dentry, 0) != 0) - goto close_fail; - - retval = binfmt->core_dump(signr, regs, file); - -close_fail: - filp_close(file, NULL); -fail: - if (fsuid != current->fsuid) - current->fsuid = fsuid; - unlock_kernel(); - return retval; -} diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/page.h b/linux-2.4.29-xen-sparse/include/asm-xen/page.h index fbab7f5ff1..3150545429 100644 --- a/linux-2.4.29-xen-sparse/include/asm-xen/page.h +++ b/linux-2.4.29-xen-sparse/include/asm-xen/page.h @@ -85,23 +85,18 @@
typedef struct { unsigned long pgprot; } pgprot_t; static inline unsigned long pmd_val(pmd_t x) { unsigned long ret = x.pmd; - if ( (ret & 1) ) ret = machine_to_phys(ret); + if ( ret ) ret = machine_to_phys(ret) | 1; return ret; } #define pmd_val_ma(x) ((x).pmd) #define pgd_val(x) ({ BUG(); (unsigned long)0; }) #define pgprot_val(x) ((x).pgprot) -static inline pte_t __pte(unsigned long x) -{ - if ( (x & 1) ) x = phys_to_machine(x); - return ((pte_t) { (x) }); -} -static inline pmd_t __pmd(unsigned long x) -{ - if ( (x & 1) ) x = phys_to_machine(x); - return ((pmd_t) { (x) }); -} +#define __pte(x) ({ unsigned long _x = (x); \ + (((_x)&1) ? ((pte_t) {phys_to_machine(_x)}) : ((pte_t) {(_x)})); }) +#define __pte_ma(x) ((pte_t) { (x) } ) +#define __pmd(x) ({ unsigned long _x = (x); \ + (((_x)&1) ? ((pmd_t) {phys_to_machine(_x)}) : ((pmd_t) {(_x)})); }) #define __pgd(x) ({ BUG(); (pgprot_t) { 0 }; }) #define __pgprot(x) ((pgprot_t) { (x) } ) diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h index 4e9584e918..2a0c226c71 100644 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgalloc.h @@ -22,7 +22,6 @@ #define pmd_populate(mm, pmd, pte) \ do { \ set_pmd(pmd, __pmd(_PAGE_TABLE + __pa(pte))); \ - XEN_flush_page_update_queue(); \ } while ( 0 ) /* @@ -79,8 +78,9 @@ static inline pgd_t *get_pgd_slow(void) memcpy(pgd + USER_PTRS_PER_PGD, init_mm.pgd + USER_PTRS_PER_PGD, (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t)); - __make_page_readonly(pgd); + __make_page_readonly(pgd); queue_pgd_pin(__pa(pgd)); + flush_page_update_queue(); } return pgd; } @@ -111,7 +111,8 @@ static inline void free_pgd_slow(pgd_t *pgd) kmem_cache_free(pae_pgd_cachep, pgd); #else queue_pgd_unpin(__pa(pgd)); - __make_page_writable(pgd); + __make_page_writable(pgd); + flush_page_update_queue(); free_page((unsigned long)pgd); #endif } @@ -135,6 +136,7 @@ static inline pte_t 
*pte_alloc_one(struct mm_struct *mm, unsigned long address) clear_page(pte); __make_page_readonly(pte); queue_pte_pin(__pa(pte)); + flush_page_update_queue(); } return pte; @@ -155,6 +157,7 @@ static __inline__ void pte_free_slow(pte_t *pte) { queue_pte_unpin(__pa(pte)); __make_page_writable(pte); + flush_page_update_queue(); free_page((unsigned long)pte); } @@ -208,22 +211,19 @@ extern int do_check_pgt_cache(int, int); static inline void flush_tlb_mm(struct mm_struct *mm) { - if (mm == current->active_mm) queue_tlb_flush(); - XEN_flush_page_update_queue(); + if (mm == current->active_mm) xen_tlb_flush(); } static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr) { - if (vma->vm_mm == current->active_mm) queue_invlpg(addr); - XEN_flush_page_update_queue(); + if (vma->vm_mm == current->active_mm) xen_invlpg(addr); } static inline void flush_tlb_range(struct mm_struct *mm, unsigned long start, unsigned long end) { - if (mm == current->active_mm) queue_tlb_flush(); - XEN_flush_page_update_queue(); + if (mm == current->active_mm) xen_tlb_flush(); } #else @@ -261,7 +261,6 @@ static inline void flush_tlb_pgtables(struct mm_struct *mm, unsigned long start, unsigned long end) { /* i386 does not keep any page table caches in TLB */ - XEN_flush_page_update_queue(); } /* diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h index d91b48360e..70f8356fb1 100644 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable-2level.h @@ -34,9 +34,19 @@ static inline int pgd_bad(pgd_t pgd) { return 0; } static inline int pgd_present(pgd_t pgd) { return 1; } #define pgd_clear(xp) do { } while (0) -#define set_pte(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low) -#define set_pte_atomic(pteptr, pteval) queue_l1_entry_update(pteptr, (pteval).pte_low) -#define set_pmd(pmdptr, pmdval) queue_l2_entry_update((pmdptr), 
(pmdval)) +/* + * Certain architectures need to do special things when PTEs + * within a page table are directly modified. Thus, the following + * hook is made available. + */ +#define set_pte(pteptr, pteval) (*(pteptr) = pteval) +#define set_pte_atomic(pteptr, pteval) (*(pteptr) = pteval) + +/* + * (pmds are folded into pgds so this doesnt get actually called, + * but the define is needed for a generic inline function.) + */ +#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) #define set_pgd(pgdptr, pgdval) ((void)0) #define pgd_page(pgd) \ @@ -47,6 +57,7 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) return (pmd_t *) dir; } +#define ptep_get_and_clear(xp) __pte_ma(xchg(&(xp)->pte_low, 0)) #define pte_same(a, b) ((a).pte_low == (b).pte_low) /* @@ -83,21 +94,4 @@ static inline pmd_t * pmd_offset(pgd_t * dir, unsigned long address) #define pte_none(x) (!(x).pte_low) #define __mk_pte(page_nr,pgprot) __pte(((page_nr) << PAGE_SHIFT) | pgprot_val(pgprot)) -/* - * A note on implementation of this atomic 'get-and-clear' operation. - * This is actually very simple because XenoLinux can only run on a single - * processor. Therefore, we cannot race other processors setting the 'accessed' - * or 'dirty' bits on a page-table entry. - * Even if pages are shared between domains, that is not a problem because - * each domain will have separate page tables, with their own versions of - * accessed & dirty state. 
- */ -static inline pte_t ptep_get_and_clear(pte_t *xp) -{ - pte_t pte = *xp; - if ( !pte_none(pte) ) - queue_l1_entry_update(xp, 0); - return pte; -} - #endif /* _I386_PGTABLE_2LEVEL_H */ diff --git a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h index c15f0e9509..f5a53adc82 100644 --- a/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h +++ b/linux-2.4.29-xen-sparse/include/asm-xen/pgtable.h @@ -38,11 +38,11 @@ extern void paging_init(void); extern unsigned long pgkern_mask; -#define __flush_tlb() ({ queue_tlb_flush(); XEN_flush_page_update_queue(); }) +#define __flush_tlb() xen_tlb_flush() #define __flush_tlb_global() __flush_tlb() #define __flush_tlb_all() __flush_tlb_global() -#define __flush_tlb_one(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); }) -#define __flush_tlb_single(addr) ({ queue_invlpg(addr); XEN_flush_page_update_queue(); }) +#define __flush_tlb_one(addr) xen_invlpg(addr) +#define __flush_tlb_single(addr) xen_invlpg(addr) /* * ZERO_PAGE is a global shared page that is always zero: used @@ -179,12 +179,14 @@ extern void * high_memory; #define __S111 PAGE_SHARED #define pte_present(x) ((x).pte_low & (_PAGE_PRESENT | _PAGE_PROTNONE)) -#define pte_clear(xp) queue_l1_entry_update(xp, 0) +#define pte_clear(xp) do { set_pte(xp, __pte(0)); } while (0) -#define pmd_none(x) (!(x).pmd) -#define pmd_present(x) ((x).pmd & _PAGE_PRESENT) +#define pmd_none(x) (!pmd_val(x)) +/* pmd_present doesn't just test the _PAGE_PRESENT bit since wr.p.t. + can temporarily clear it. 
*/ +#define pmd_present(x) (pmd_val(x)) #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) -#define pmd_bad(x) (((x).pmd & (~PAGE_MASK & ~_PAGE_USER)) != _KERNPG_TABLE) +#define pmd_bad(x) ((pmd_val(x) & (~PAGE_MASK & ~_PAGE_USER & ~_PAGE_PRESENT)) != (_KERNPG_TABLE & ~_PAGE_PRESENT)) #define pages_to_mb(x) ((x) >> (20-PAGE_SHIFT)) @@ -212,29 +214,28 @@ static inline pte_t pte_mkwrite(pte_t pte) { (pte).pte_low |= _PAGE_RW; return p static inline int ptep_test_and_clear_dirty(pte_t *ptep) { - unsigned long pteval = *(unsigned long *)ptep; - int ret = pteval & _PAGE_DIRTY; - if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_DIRTY); - return ret; + if (!pte_dirty(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } -static inline int ptep_test_and_clear_young(pte_t *ptep) + +static inline int ptep_test_and_clear_young(pte_t *ptep) { - unsigned long pteval = *(unsigned long *)ptep; - int ret = pteval & _PAGE_ACCESSED; - if ( ret ) queue_l1_entry_update(ptep, pteval & ~_PAGE_ACCESSED); - return ret; + if (!pte_young(*ptep)) + return 0; + return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte_low); } + static inline void ptep_set_wrprotect(pte_t *ptep) { - unsigned long pteval = *(unsigned long *)ptep; - if ( (pteval & _PAGE_RW) ) - queue_l1_entry_update(ptep, pteval & ~_PAGE_RW); + if (pte_write(*ptep)) + clear_bit(_PAGE_BIT_RW, &ptep->pte_low); } + static inline void ptep_mkdirty(pte_t *ptep) { - unsigned long pteval = *(unsigned long *)ptep; - if ( !(pteval & _PAGE_DIRTY) ) - queue_l1_entry_update(ptep, pteval | _PAGE_DIRTY); + if (!pte_dirty(*ptep)) + set_bit(_PAGE_BIT_DIRTY, &ptep->pte_low); } /* diff --git a/linux-2.4.29-xen-sparse/mm/highmem.c b/linux-2.4.29-xen-sparse/mm/highmem.c index 341e6e29a9..f8182820ac 100644 --- a/linux-2.4.29-xen-sparse/mm/highmem.c +++ b/linux-2.4.29-xen-sparse/mm/highmem.c @@ -122,7 +122,6 @@ start: } vaddr = PKMAP_ADDR(last_pkmap_nr); set_pte(&(pkmap_page_table[last_pkmap_nr]), 
mk_pte(page, kmap_prot)); - XEN_flush_page_update_queue(); pkmap_count[last_pkmap_nr] = 1; page->virtual = (void *) vaddr; diff --git a/linux-2.4.29-xen-sparse/mm/memory.c b/linux-2.4.29-xen-sparse/mm/memory.c index 7d81c86589..880b6981c4 100644 --- a/linux-2.4.29-xen-sparse/mm/memory.c +++ b/linux-2.4.29-xen-sparse/mm/memory.c @@ -153,7 +153,6 @@ void clear_page_tables(struct mm_struct *mm, unsigned long first, int nr) free_one_pgd(page_dir); page_dir++; } while (--nr); - XEN_flush_page_update_queue(); spin_unlock(&mm->page_table_lock); /* keep the page table cache within bounds */ @@ -249,10 +248,8 @@ skip_copy_pte_range: address = (address + PMD_SIZE) & PMD_MASK; /* If it's a COW mapping, write protect it both in the parent and the child */ if (cow && pte_write(pte)) { - /* XEN modification: modified ordering here to avoid RaW hazard. */ - pte = *src_pte; - pte = pte_wrprotect(pte); ptep_set_wrprotect(src_pte); + pte = *src_pte; } /* If it's a shared mapping, mark it clean in the child */ @@ -914,7 +911,6 @@ static inline void establish_pte(struct vm_area_struct * vma, unsigned long addr { #ifdef CONFIG_XEN if ( likely(vma->vm_mm == current->mm) ) { - XEN_flush_page_update_queue(); HYPERVISOR_update_va_mapping(address, entry, UVMF_INVLPG); } else { set_pte(page_table, entry); @@ -1189,13 +1185,10 @@ static int do_swap_page(struct mm_struct * mm, flush_page_to_ram(page); flush_icache_page(vma, page); #ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) { - XEN_flush_page_update_queue(); + if ( likely(vma->vm_mm == current->mm) ) HYPERVISOR_update_va_mapping(address, pte, 0); - } else { + else set_pte(page_table, pte); - XEN_flush_page_update_queue(); - } #else set_pte(page_table, pte); #endif @@ -1245,13 +1238,10 @@ static int do_anonymous_page(struct mm_struct * mm, struct vm_area_struct * vma, } #ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) { - XEN_flush_page_update_queue(); + if ( likely(vma->vm_mm == current->mm) ) 
HYPERVISOR_update_va_mapping(addr, entry, 0); - } else { + else set_pte(page_table, entry); - XEN_flush_page_update_queue(); - } #else set_pte(page_table, entry); #endif @@ -1331,13 +1321,10 @@ static int do_no_page(struct mm_struct * mm, struct vm_area_struct * vma, if (write_access) entry = pte_mkwrite(pte_mkdirty(entry)); #ifdef CONFIG_XEN - if ( likely(vma->vm_mm == current->mm) ) { - XEN_flush_page_update_queue(); + if ( likely(vma->vm_mm == current->mm) ) HYPERVISOR_update_va_mapping(address, entry, 0); - } else { + else set_pte(page_table, entry); - XEN_flush_page_update_queue(); - } #else set_pte(page_table, entry); #endif @@ -1484,7 +1471,6 @@ pte_t fastcall *pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long addres /* "fast" allocation can happen without dropping the lock.. */ new = pte_alloc_one_fast(mm, address); if (!new) { - XEN_flush_page_update_queue(); spin_unlock(&mm->page_table_lock); new = pte_alloc_one(mm, address); spin_lock(&mm->page_table_lock); diff --git a/linux-2.4.29-xen-sparse/mm/mremap.c b/linux-2.4.29-xen-sparse/mm/mremap.c index 330e194bae..475c308b1b 100644 --- a/linux-2.4.29-xen-sparse/mm/mremap.c +++ b/linux-2.4.29-xen-sparse/mm/mremap.c @@ -119,11 +119,9 @@ static int move_page_tables(struct mm_struct * mm, * the old page tables) */ oops_we_failed: - XEN_flush_page_update_queue(); flush_cache_range(mm, new_addr, new_addr + len); while ((offset += PAGE_SIZE) < len) move_one_page(mm, new_addr + offset, old_addr + offset); - XEN_flush_page_update_queue(); zap_page_range(mm, new_addr, len); return -1; } diff --git a/linux-2.4.29-xen-sparse/mm/swapfile.c b/linux-2.4.29-xen-sparse/mm/swapfile.c deleted file mode 100644 index 6457f19b74..0000000000 --- a/linux-2.4.29-xen-sparse/mm/swapfile.c +++ /dev/null @@ -1,1267 +0,0 @@ -/* - * linux/mm/swapfile.c - * - * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds - * Swap reorganised 29.12.95, Stephen Tweedie - */ - -#include -#include -#include -#include -#include -#include /* for 
blk_size */ -#include -#include -#include - -#include - -spinlock_t swaplock = SPIN_LOCK_UNLOCKED; -unsigned int nr_swapfiles; -int total_swap_pages; -static int swap_overflow; - -static const char Bad_file[] = "Bad swap file entry "; -static const char Unused_file[] = "Unused swap file entry "; -static const char Bad_offset[] = "Bad swap offset entry "; -static const char Unused_offset[] = "Unused swap offset entry "; - -struct swap_list_t swap_list = {-1, -1}; - -struct swap_info_struct swap_info[MAX_SWAPFILES]; - -#define SWAPFILE_CLUSTER 256 - -static inline int scan_swap_map(struct swap_info_struct *si) -{ - unsigned long offset; - /* - * We try to cluster swap pages by allocating them - * sequentially in swap. Once we've allocated - * SWAPFILE_CLUSTER pages this way, however, we resort to - * first-free allocation, starting a new cluster. This - * prevents us from scattering swap pages all over the entire - * swap partition, so that we reduce overall disk seek times - * between swap pages. -- sct */ - if (si->cluster_nr) { - while (si->cluster_next <= si->highest_bit) { - offset = si->cluster_next++; - if (si->swap_map[offset]) - continue; - si->cluster_nr--; - goto got_page; - } - } - si->cluster_nr = SWAPFILE_CLUSTER; - - /* try to find an empty (even not aligned) cluster. */ - offset = si->lowest_bit; - check_next_cluster: - if (offset+SWAPFILE_CLUSTER-1 <= si->highest_bit) - { - int nr; - for (nr = offset; nr < offset+SWAPFILE_CLUSTER; nr++) - if (si->swap_map[nr]) - { - offset = nr+1; - goto check_next_cluster; - } - /* We found a completly empty cluster, so start - * using it. - */ - goto got_page; - } - /* No luck, so now go finegrined as usual. 
-Andrea */ - for (offset = si->lowest_bit; offset <= si->highest_bit ; offset++) { - if (si->swap_map[offset]) - continue; - si->lowest_bit = offset+1; - got_page: - if (offset == si->lowest_bit) - si->lowest_bit++; - if (offset == si->highest_bit) - si->highest_bit--; - if (si->lowest_bit > si->highest_bit) { - si->lowest_bit = si->max; - si->highest_bit = 0; - } - si->swap_map[offset] = 1; - nr_swap_pages--; - si->cluster_next = offset+1; - return offset; - } - si->lowest_bit = si->max; - si->highest_bit = 0; - return 0; -} - -swp_entry_t get_swap_page(void) -{ - struct swap_info_struct * p; - unsigned long offset; - swp_entry_t entry; - int type, wrapped = 0; - - entry.val = 0; /* Out of memory */ - swap_list_lock(); - type = swap_list.next; - if (type < 0) - goto out; - if (nr_swap_pages <= 0) - goto out; - - while (1) { - p = &swap_info[type]; - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - swap_device_lock(p); - offset = scan_swap_map(p); - swap_device_unlock(p); - if (offset) { - entry = SWP_ENTRY(type,offset); - type = swap_info[type].next; - if (type < 0 || - p->prio != swap_info[type].prio) { - swap_list.next = swap_list.head; - } else { - swap_list.next = type; - } - goto out; - } - } - type = p->next; - if (!wrapped) { - if (type < 0 || p->prio != swap_info[type].prio) { - type = swap_list.head; - wrapped = 1; - } - } else - if (type < 0) - goto out; /* out of swap space */ - } -out: - swap_list_unlock(); - return entry; -} - -static struct swap_info_struct * swap_info_get(swp_entry_t entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - - if (!entry.val) - goto out; - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) - goto bad_nofile; - p = & swap_info[type]; - if (!(p->flags & SWP_USED)) - goto bad_device; - offset = SWP_OFFSET(entry); - if (offset >= p->max) - goto bad_offset; - if (!p->swap_map[offset]) - goto bad_free; - swap_list_lock(); - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = type; - 
swap_device_lock(p); - return p; - -bad_free: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_offset, entry.val); - goto out; -bad_offset: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_offset, entry.val); - goto out; -bad_device: - printk(KERN_ERR "swap_free: %s%08lx\n", Unused_file, entry.val); - goto out; -bad_nofile: - printk(KERN_ERR "swap_free: %s%08lx\n", Bad_file, entry.val); -out: - return NULL; -} - -static void swap_info_put(struct swap_info_struct * p) -{ - swap_device_unlock(p); - swap_list_unlock(); -} - -static int swap_entry_free(struct swap_info_struct *p, unsigned long offset) -{ - int count = p->swap_map[offset]; - - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = count; - if (!count) { - if (offset < p->lowest_bit) - p->lowest_bit = offset; - if (offset > p->highest_bit) - p->highest_bit = offset; - nr_swap_pages++; - } - } - return count; -} - -/* - * Caller has made sure that the swapdevice corresponding to entry - * is still around or has not been recycled. - */ -void swap_free(swp_entry_t entry) -{ - struct swap_info_struct * p; - - p = swap_info_get(entry); - if (p) { - swap_entry_free(p, SWP_OFFSET(entry)); - swap_info_put(p); - } -} - -/* - * Check if we're the only user of a swap page, - * when the page is locked. - */ -static int exclusive_swap_page(struct page *page) -{ - int retval = 0; - struct swap_info_struct * p; - swp_entry_t entry; - - entry.val = page->index; - p = swap_info_get(entry); - if (p) { - /* Is the only swap cache user the cache itself? */ - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) - retval = 1; - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - } - return retval; -} - -/* - * We can use this swap cache entry directly - * if there are no other references to it. 
- * - * Here "exclusive_swap_page()" does the real - * work, but we opportunistically check whether - * we need to get all the locks first.. - */ -int fastcall can_share_swap_page(struct page *page) -{ - int retval = 0; - - if (!PageLocked(page)) - BUG(); - switch (page_count(page)) { - case 3: - if (!page->buffers) - break; - /* Fallthrough */ - case 2: - if (!PageSwapCache(page)) - break; - retval = exclusive_swap_page(page); - break; - case 1: - if (PageReserved(page)) - break; - retval = 1; - } - return retval; -} - -/* - * Work out if there are any other processes sharing this - * swap cache page. Free it if you can. Return success. - */ -int fastcall remove_exclusive_swap_page(struct page *page) -{ - int retval; - struct swap_info_struct * p; - swp_entry_t entry; - - if (!PageLocked(page)) - BUG(); - if (!PageSwapCache(page)) - return 0; - if (page_count(page) - !!page->buffers != 2) /* 2: us + cache */ - return 0; - - entry.val = page->index; - p = swap_info_get(entry); - if (!p) - return 0; - - /* Is the only swap cache user the cache itself? */ - retval = 0; - if (p->swap_map[SWP_OFFSET(entry)] == 1) { - /* Recheck the page count with the pagecache lock held.. */ - spin_lock(&pagecache_lock); - if (page_count(page) - !!page->buffers == 2) { - __delete_from_swap_cache(page); - SetPageDirty(page); - retval = 1; - } - spin_unlock(&pagecache_lock); - } - swap_info_put(p); - - if (retval) { - block_flushpage(page, 0); - swap_free(entry); - page_cache_release(page); - } - - return retval; -} - -/* - * Free the swap entry like above, but also try to - * free the page cache entry if it is the last user. - */ -void free_swap_and_cache(swp_entry_t entry) -{ - struct swap_info_struct * p; - struct page *page = NULL; - - p = swap_info_get(entry); - if (p) { - if (swap_entry_free(p, SWP_OFFSET(entry)) == 1) - page = find_trylock_page(&swapper_space, entry.val); - swap_info_put(p); - } - if (page) { - page_cache_get(page); - /* Only cache user (+us), or swap space full? 
Free it! */ - if (page_count(page) - !!page->buffers == 2 || vm_swap_full()) { - delete_from_swap_cache(page); - SetPageDirty(page); - } - UnlockPage(page); - page_cache_release(page); - } -} - -/* - * The swap entry has been read in advance, and we return 1 to indicate - * that the page has been used or is no longer needed. - * - * Always set the resulting pte to be nowrite (the same as COW pages - * after one process has exited). We don't know just how many PTEs will - * share this swap entry, so be cautious and let do_wp_page work out - * what to do if a write is requested later. - */ -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pte(struct vm_area_struct * vma, unsigned long address, - pte_t *dir, swp_entry_t entry, struct page* page) -{ - pte_t pte = *dir; - - if (likely(pte_to_swp_entry(pte).val != entry.val)) - return; - if (unlikely(pte_none(pte) || pte_present(pte))) - return; - get_page(page); - set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot))); - swap_free(entry); - ++vma->vm_mm->rss; -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir, - unsigned long address, unsigned long size, unsigned long offset, - swp_entry_t entry, struct page* page) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*dir)) - return; - if (pmd_bad(*dir)) { - pmd_ERROR(*dir); - pmd_clear(dir); - return; - } - pte = pte_offset(dir, address); - offset += address & PMD_MASK; - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - unuse_pte(vma, offset+address-vma->vm_start, pte, entry, page); - address += PAGE_SIZE; - pte++; - } while (address && (address < end)); -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static inline void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir, - unsigned long address, unsigned long size, - swp_entry_t entry, struct page* page) -{ - pmd_t * pmd; - unsigned 
long offset, end; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, address); - offset = address & PGDIR_MASK; - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - if (address >= end) - BUG(); - do { - unuse_pmd(vma, pmd, address, end - address, offset, entry, - page); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address && (address < end)); -} - -/* mmlist_lock and vma->vm_mm->page_table_lock are held */ -static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir, - swp_entry_t entry, struct page* page) -{ - unsigned long start = vma->vm_start, end = vma->vm_end; - - if (start >= end) - BUG(); - do { - unuse_pgd(vma, pgdir, start, end - start, entry, page); - start = (start + PGDIR_SIZE) & PGDIR_MASK; - pgdir++; - } while (start && (start < end)); -} - -static void unuse_process(struct mm_struct * mm, - swp_entry_t entry, struct page* page) -{ - struct vm_area_struct* vma; - - /* - * Go through process' page directory. - */ - spin_lock(&mm->page_table_lock); - for (vma = mm->mmap; vma; vma = vma->vm_next) { - pgd_t * pgd = pgd_offset(mm, vma->vm_start); - unuse_vma(vma, pgd, entry, page); - } - XEN_flush_page_update_queue(); - spin_unlock(&mm->page_table_lock); - return; -} - -/* - * Scan swap_map from current position to next entry still in use. - * Recycle to start on reaching the end, returning 0 when empty. - */ -static int find_next_to_unuse(struct swap_info_struct *si, int prev) -{ - int max = si->max; - int i = prev; - int count; - - /* - * No need for swap_device_lock(si) here: we're just looking - * for whether an entry is in use, not modifying it; false - * hits are okay, and sys_swapoff() has already prevented new - * allocations from this area (while holding swap_list_lock()). 
- */ - for (;;) { - if (++i >= max) { - if (!prev) { - i = 0; - break; - } - /* - * No entries in use at top of swap_map, - * loop back to start and recheck there. - */ - max = prev + 1; - prev = 0; - i = 1; - } - count = si->swap_map[i]; - if (count && count != SWAP_MAP_BAD) - break; - } - return i; -} - -/* - * We completely avoid races by reading each swap page in advance, - * and then search for the process using it. All the necessary - * page table adjustments can then be made atomically. - */ -static int try_to_unuse(unsigned int type) -{ - struct swap_info_struct * si = &swap_info[type]; - struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; - struct page *page; - swp_entry_t entry; - int i = 0; - int retval = 0; - int reset_overflow = 0; - int shmem; - - /* - * When searching mms for an entry, a good strategy is to - * start at the first mm we freed the previous entry from - * (though actually we don't notice whether we or coincidence - * freed the entry). Initialize this start_mm with a hold. - * - * A simpler strategy would be to start at the last mm we - * freed the previous entry from; but that would take less - * advantage of mmlist ordering (now preserved by swap_out()), - * which clusters forked address spaces together, most recent - * child immediately after parent. If we race with dup_mmap(), - * we very much want to resolve parent before child, otherwise - * we may miss some entries: using last mm would invert that. - */ - start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); - - /* - * Keep on scanning until all entries have gone. Usually, - * one pass through swap_map is enough, but not necessarily: - * mmput() removes mm from mmlist before exit_mmap() and its - * zap_page_range(). That's not too bad, those entries are - * on their way out, and handled faster there than here. - * do_munmap() behaves similarly, taking the range out of mm's - * vma list before zap_page_range(). 
But unfortunately, when - * unmapping a part of a vma, it takes the whole out first, - * then reinserts what's left after (might even reschedule if - * open() method called) - so swap entries may be invisible - * to swapoff for a while, then reappear - but that is rare. - */ - while ((i = find_next_to_unuse(si, i))) { - /* - * Get a page for the entry, using the existing swap - * cache page if there is one. Otherwise, get a clean - * page and read the swap into it. - */ - swap_map = &si->swap_map[i]; - entry = SWP_ENTRY(type, i); - page = read_swap_cache_async(entry); - if (!page) { - /* - * Either swap_duplicate() failed because entry - * has been freed independently, and will not be - * reused since sys_swapoff() already disabled - * allocation from here, or alloc_page() failed. - */ - if (!*swap_map) - continue; - retval = -ENOMEM; - break; - } - - /* - * Don't hold on to start_mm if it looks like exiting. - */ - if (atomic_read(&start_mm->mm_users) == 1) { - mmput(start_mm); - start_mm = &init_mm; - atomic_inc(&init_mm.mm_users); - } - - /* - * Wait for and lock page. When do_swap_page races with - * try_to_unuse, do_swap_page can handle the fault much - * faster than try_to_unuse can locate the entry. This - * apparently redundant "wait_on_page" lets try_to_unuse - * defer to do_swap_page in such a case - in some tests, - * do_swap_page and try_to_unuse repeatedly compete. - */ - wait_on_page(page); - lock_page(page); - - /* - * Remove all references to entry, without blocking. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. 
- */ - shmem = 0; - swcount = *swap_map; - if (swcount > 1) { - flush_page_to_ram(page); - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - unuse_process(start_mm, entry, page); - } - if (*swap_map > 1) { - int set_start_mm = (*swap_map >= swcount); - struct list_head *p = &start_mm->mmlist; - struct mm_struct *new_start_mm = start_mm; - struct mm_struct *mm; - - spin_lock(&mmlist_lock); - while (*swap_map > 1 && - (p = p->next) != &start_mm->mmlist) { - mm = list_entry(p, struct mm_struct, mmlist); - swcount = *swap_map; - if (mm == &init_mm) { - set_start_mm = 1; - spin_unlock(&mmlist_lock); - shmem = shmem_unuse(entry, page); - spin_lock(&mmlist_lock); - } else - unuse_process(mm, entry, page); - if (set_start_mm && *swap_map < swcount) { - new_start_mm = mm; - set_start_mm = 0; - } - } - atomic_inc(&new_start_mm->mm_users); - spin_unlock(&mmlist_lock); - mmput(start_mm); - start_mm = new_start_mm; - } - - /* - * How could swap count reach 0x7fff when the maximum - * pid is 0x7fff, and there's no way to repeat a swap - * page within an mm (except in shmem, where it's the - * shared object which takes the reference count)? - * We believe SWAP_MAP_MAX cannot occur in Linux 2.4. - * - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - if (*swap_map == SWAP_MAP_MAX) { - swap_list_lock(); - swap_device_lock(si); - nr_swap_pages++; - *swap_map = 1; - swap_device_unlock(si); - swap_list_unlock(); - reset_overflow = 1; - } - - /* - * If a reference remains (rare), we would like to leave - * the page in the swap cache; but try_to_swap_out could - * then re-duplicate the entry once we drop page lock, - * so we might loop indefinitely; also, that page could - * not be swapped out to other storage meanwhile. 
So: - * delete from cache even if there's another reference, - * after ensuring that the data has been saved to disk - - * since if the reference remains (rarer), it will be - * read from disk into another page. Splitting into two - * pages would be incorrect if swap supported "shared - * private" pages, but they are handled by tmpfs files. - * - * Note shmem_unuse already deleted swappage from cache, - * unless corresponding filepage found already in cache: - * in which case it left swappage in cache, lowered its - * swap count to pass quickly through the loops above, - * and now we must reincrement count to try again later. - */ - if ((*swap_map > 1) && PageDirty(page) && PageSwapCache(page)) { - rw_swap_page(WRITE, page); - lock_page(page); - } - if (PageSwapCache(page)) { - if (shmem) - swap_duplicate(entry); - else - delete_from_swap_cache(page); - } - - /* - * So we could skip searching mms once swap count went - * to 1, we did not mark any present ptes as dirty: must - * mark page dirty so try_to_swap_out will preserve it. - */ - SetPageDirty(page); - UnlockPage(page); - page_cache_release(page); - - /* - * Make sure that we aren't completely killing - * interactive performance. Interruptible check on - * signal_pending() would be nice, but changes the spec? 
- */ - if (current->need_resched) - schedule(); - } - - mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } - return retval; -} - -asmlinkage long sys_swapoff(const char * specialfile) -{ - struct swap_info_struct * p = NULL; - unsigned short *swap_map; - struct nameidata nd; - int i, type, prev; - int err; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - err = user_path_walk(specialfile, &nd); - if (err) - goto out; - - lock_kernel(); - prev = -1; - swap_list_lock(); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; - if ((p->flags & SWP_WRITEOK) == SWP_WRITEOK) { - if (p->swap_file == nd.dentry) - break; - } - prev = type; - } - err = -EINVAL; - if (type < 0) { - swap_list_unlock(); - goto out_dput; - } - - if (prev < 0) { - swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } - if (type == swap_list.next) { - /* just pick something that's safe... 
*/ - swap_list.next = swap_list.head; - } - nr_swap_pages -= p->pages; - total_swap_pages -= p->pages; - p->flags = SWP_USED; - swap_list_unlock(); - unlock_kernel(); - err = try_to_unuse(type); - lock_kernel(); - if (err) { - /* re-insert swap space back into swap_list */ - swap_list_lock(); - for (prev = -1, i = swap_list.head; i >= 0; prev = i, i = swap_info[i].next) - if (p->prio >= swap_info[i].prio) - break; - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; - else - swap_info[prev].next = p - swap_info; - nr_swap_pages += p->pages; - total_swap_pages += p->pages; - p->flags = SWP_WRITEOK; - swap_list_unlock(); - goto out_dput; - } - if (p->swap_device) - blkdev_put(p->swap_file->d_inode->i_bdev, BDEV_SWAP); - path_release(&nd); - - swap_list_lock(); - swap_device_lock(p); - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_vfsmnt = NULL; - p->swap_file = NULL; - p->swap_device = 0; - p->max = 0; - swap_map = p->swap_map; - p->swap_map = NULL; - p->flags = 0; - swap_device_unlock(p); - swap_list_unlock(); - vfree(swap_map); - err = 0; - -out_dput: - unlock_kernel(); - path_release(&nd); -out: - return err; -} - -int get_swaparea_info(char *buf) -{ - char * page = (char *) __get_free_page(GFP_KERNEL); - struct swap_info_struct *ptr = swap_info; - int i, j, len = 0, usedswap; - - if (!page) - return -ENOMEM; - - len += sprintf(buf, "Filename\t\t\tType\t\tSize\tUsed\tPriority\n"); - for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { - if ((ptr->flags & SWP_USED) && ptr->swap_map) { - char * path = d_path(ptr->swap_file, ptr->swap_vfsmnt, - page, PAGE_SIZE); - - len += sprintf(buf + len, "%-31s ", path); - - if (!ptr->swap_device) - len += sprintf(buf + len, "file\t\t"); - else - len += sprintf(buf + len, "partition\t"); - - usedswap = 0; - for (j = 0; j < ptr->max; ++j) - switch (ptr->swap_map[j]) { - case SWAP_MAP_BAD: - case 0: - continue; - default: - usedswap++; - } - len += sprintf(buf + len, "%d\t%d\t%d\n", ptr->pages 
<< (PAGE_SHIFT - 10), - usedswap << (PAGE_SHIFT - 10), ptr->prio); - } - } - free_page((unsigned long) page); - return len; -} - -int is_swap_partition(kdev_t dev) { - struct swap_info_struct *ptr = swap_info; - int i; - - for (i = 0 ; i < nr_swapfiles ; i++, ptr++) { - if (ptr->flags & SWP_USED) - if (ptr->swap_device == dev) - return 1; - } - return 0; -} - -/* - * Written 01/25/92 by Simmule Turner, heavily changed by Linus. - * - * The swapon system call - */ -asmlinkage long sys_swapon(const char * specialfile, int swap_flags) -{ - struct swap_info_struct * p; - struct nameidata nd; - struct inode * swap_inode; - unsigned int type; - int i, j, prev; - int error; - static int least_priority = 0; - union swap_header *swap_header = 0; - int swap_header_version; - int nr_good_pages = 0; - unsigned long maxpages = 1; - int swapfilesize; - struct block_device *bdev = NULL; - unsigned short *swap_map; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - lock_kernel(); - swap_list_lock(); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) - break; - error = -EPERM; - if (type >= MAX_SWAPFILES) { - swap_list_unlock(); - goto out; - } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - p->flags = SWP_USED; - p->swap_file = NULL; - p->swap_vfsmnt = NULL; - p->swap_device = 0; - p->swap_map = NULL; - p->lowest_bit = 0; - p->highest_bit = 0; - p->cluster_nr = 0; - p->sdev_lock = SPIN_LOCK_UNLOCKED; - p->next = -1; - if (swap_flags & SWAP_FLAG_PREFER) { - p->prio = - (swap_flags & SWAP_FLAG_PRIO_MASK)>>SWAP_FLAG_PRIO_SHIFT; - } else { - p->prio = --least_priority; - } - swap_list_unlock(); - error = user_path_walk(specialfile, &nd); - if (error) - goto bad_swap_2; - - p->swap_file = nd.dentry; - p->swap_vfsmnt = nd.mnt; - swap_inode = nd.dentry->d_inode; - error = -EINVAL; - - if (S_ISBLK(swap_inode->i_mode)) { - kdev_t dev = swap_inode->i_rdev; - struct block_device_operations *bdops; - devfs_handle_t de; - - if 
(is_mounted(dev)) { - error = -EBUSY; - goto bad_swap_2; - } - - p->swap_device = dev; - set_blocksize(dev, PAGE_SIZE); - - bd_acquire(swap_inode); - bdev = swap_inode->i_bdev; - de = devfs_get_handle_from_inode(swap_inode); - bdops = devfs_get_ops(de); /* Increments module use count */ - if (bdops) bdev->bd_op = bdops; - - error = blkdev_get(bdev, FMODE_READ|FMODE_WRITE, 0, BDEV_SWAP); - devfs_put_ops(de);/*Decrement module use count now we're safe*/ - if (error) - goto bad_swap_2; - set_blocksize(dev, PAGE_SIZE); - error = -ENODEV; - if (!dev || (blk_size[MAJOR(dev)] && - !blk_size[MAJOR(dev)][MINOR(dev)])) - goto bad_swap; - swapfilesize = 0; - if (blk_size[MAJOR(dev)]) - swapfilesize = blk_size[MAJOR(dev)][MINOR(dev)] - >> (PAGE_SHIFT - 10); - } else if (S_ISREG(swap_inode->i_mode)) - swapfilesize = swap_inode->i_size >> PAGE_SHIFT; - else - goto bad_swap; - - error = -EBUSY; - for (i = 0 ; i < nr_swapfiles ; i++) { - struct swap_info_struct *q = &swap_info[i]; - if (i == type || !q->swap_file) - continue; - if (swap_inode->i_mapping == q->swap_file->d_inode->i_mapping) - goto bad_swap; - } - - swap_header = (void *) __get_free_page(GFP_USER); - if (!swap_header) { - printk("Unable to start swapping: out of memory :-)\n"); - error = -ENOMEM; - goto bad_swap; - } - - lock_page(virt_to_page(swap_header)); - rw_swap_page_nolock(READ, SWP_ENTRY(type,0), (char *) swap_header); - - if (!memcmp("SWAP-SPACE",swap_header->magic.magic,10)) - swap_header_version = 1; - else if (!memcmp("SWAPSPACE2",swap_header->magic.magic,10)) - swap_header_version = 2; - else { - printk("Unable to find swap-space signature\n"); - error = -EINVAL; - goto bad_swap; - } - - switch (swap_header_version) { - case 1: - memset(((char *) swap_header)+PAGE_SIZE-10,0,10); - j = 0; - p->lowest_bit = 0; - p->highest_bit = 0; - for (i = 1 ; i < 8*PAGE_SIZE ; i++) { - if (test_bit(i,(char *) swap_header)) { - if (!p->lowest_bit) - p->lowest_bit = i; - p->highest_bit = i; - maxpages = i+1; - j++; - } 
- } - nr_good_pages = j; - p->swap_map = vmalloc(maxpages * sizeof(short)); - if (!p->swap_map) { - error = -ENOMEM; - goto bad_swap; - } - for (i = 1 ; i < maxpages ; i++) { - if (test_bit(i,(char *) swap_header)) - p->swap_map[i] = 0; - else - p->swap_map[i] = SWAP_MAP_BAD; - } - break; - - case 2: - /* Check the swap header's sub-version and the size of - the swap file and bad block lists */ - if (swap_header->info.version != 1) { - printk(KERN_WARNING - "Unable to handle swap header version %d\n", - swap_header->info.version); - error = -EINVAL; - goto bad_swap; - } - - p->lowest_bit = 1; - maxpages = SWP_OFFSET(SWP_ENTRY(0,~0UL)) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; - p->highest_bit = maxpages - 1; - - error = -EINVAL; - if (swap_header->info.nr_badpages > MAX_SWAP_BADPAGES) - goto bad_swap; - - /* OK, set up the swap map and apply the bad block list */ - if (!(p->swap_map = vmalloc(maxpages * sizeof(short)))) { - error = -ENOMEM; - goto bad_swap; - } - - error = 0; - memset(p->swap_map, 0, maxpages * sizeof(short)); - for (i=0; iinfo.nr_badpages; i++) { - int page = swap_header->info.badpages[i]; - if (page <= 0 || page >= swap_header->info.last_page) - error = -EINVAL; - else - p->swap_map[page] = SWAP_MAP_BAD; - } - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (error) - goto bad_swap; - } - - if (swapfilesize && maxpages > swapfilesize) { - printk(KERN_WARNING - "Swap area shorter than signature indicates\n"); - error = -EINVAL; - goto bad_swap; - } - if (!nr_good_pages) { - printk(KERN_WARNING "Empty swap-file\n"); - error = -EINVAL; - goto bad_swap; - } - p->swap_map[0] = SWAP_MAP_BAD; - swap_list_lock(); - swap_device_lock(p); - p->max = maxpages; - p->flags = SWP_WRITEOK; - p->pages = nr_good_pages; - nr_swap_pages += nr_good_pages; - total_swap_pages += nr_good_pages; - printk(KERN_INFO "Adding Swap: %dk swap-space (priority %d)\n", - 
nr_good_pages<<(PAGE_SHIFT-10), p->prio); - - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { - break; - } - prev = i; - } - p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } - swap_device_unlock(p); - swap_list_unlock(); - error = 0; - goto out; -bad_swap: - if (bdev) - blkdev_put(bdev, BDEV_SWAP); -bad_swap_2: - swap_list_lock(); - swap_map = p->swap_map; - nd.mnt = p->swap_vfsmnt; - nd.dentry = p->swap_file; - p->swap_device = 0; - p->swap_file = NULL; - p->swap_vfsmnt = NULL; - p->swap_map = NULL; - p->flags = 0; - if (!(swap_flags & SWAP_FLAG_PREFER)) - ++least_priority; - swap_list_unlock(); - if (swap_map) - vfree(swap_map); - path_release(&nd); -out: - if (swap_header) - free_page((long) swap_header); - unlock_kernel(); - return error; -} - -void si_swapinfo(struct sysinfo *val) -{ - unsigned int i; - unsigned long nr_to_be_unused = 0; - - swap_list_lock(); - for (i = 0; i < nr_swapfiles; i++) { - unsigned int j; - if (swap_info[i].flags != SWP_USED) - continue; - for (j = 0; j < swap_info[i].max; ++j) { - switch (swap_info[i].swap_map[j]) { - case 0: - case SWAP_MAP_BAD: - continue; - default: - nr_to_be_unused++; - } - } - } - val->freeswap = nr_swap_pages + nr_to_be_unused; - val->totalswap = total_swap_pages + nr_to_be_unused; - swap_list_unlock(); -} - -/* - * Verify that a swap entry is valid and increment its swap map count. - * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. 
- */ -int swap_duplicate(swp_entry_t entry) -{ - struct swap_info_struct * p; - unsigned long offset, type; - int result = 0; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) - goto bad_file; - p = type + swap_info; - offset = SWP_OFFSET(entry); - - swap_device_lock(p); - if (offset < p->max && p->swap_map[offset]) { - if (p->swap_map[offset] < SWAP_MAP_MAX - 1) { - p->swap_map[offset]++; - result = 1; - } else if (p->swap_map[offset] <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = SWAP_MAP_MAX; - result = 1; - } - } - swap_device_unlock(p); -out: - return result; - -bad_file: - printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); - goto out; -} - -/* - * Prior swap_duplicate protects against swap device deletion. - */ -void get_swaphandle_info(swp_entry_t entry, unsigned long *offset, - kdev_t *dev, struct inode **swapf) -{ - unsigned long type; - struct swap_info_struct *p; - - type = SWP_TYPE(entry); - if (type >= nr_swapfiles) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_file, entry.val); - return; - } - - p = &swap_info[type]; - *offset = SWP_OFFSET(entry); - if (*offset >= p->max && *offset != 0) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Bad_offset, entry.val); - return; - } - if (p->swap_map && !p->swap_map[*offset]) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_offset, entry.val); - return; - } - if (!(p->flags & SWP_USED)) { - printk(KERN_ERR "rw_swap_page: %s%08lx\n", Unused_file, entry.val); - return; - } - - if (p->swap_device) { - *dev = p->swap_device; - } else if (p->swap_file) { - *swapf = p->swap_file->d_inode; - } else { - printk(KERN_ERR "rw_swap_page: no swap file or device\n"); - } - return; -} - -/* - * swap_device_lock prevents swap_map being freed. Don't grab an extra - * reference on the swaphandle, it doesn't matter if it becomes unused. 
- */ -int valid_swaphandles(swp_entry_t entry, unsigned long *offset) -{ - int ret = 0, i = 1 << page_cluster; - unsigned long toff; - struct swap_info_struct *swapdev = SWP_TYPE(entry) + swap_info; - - if (!page_cluster) /* no readahead */ - return 0; - toff = (SWP_OFFSET(entry) >> page_cluster) << page_cluster; - if (!toff) /* first page is swap header */ - toff++, i--; - *offset = toff; - - swap_device_lock(swapdev); - do { - /* Don't read-ahead past the end of the swap area */ - if (toff >= swapdev->max) - break; - /* Don't read in free or bad pages */ - if (!swapdev->swap_map[toff]) - break; - if (swapdev->swap_map[toff] == SWAP_MAP_BAD) - break; - toff++; - ret++; - } while (--i); - swap_device_unlock(swapdev); - return ret; -} diff --git a/linux-2.4.29-xen-sparse/mm/vmalloc.c b/linux-2.4.29-xen-sparse/mm/vmalloc.c deleted file mode 100644 index df02fcbf7a..0000000000 --- a/linux-2.4.29-xen-sparse/mm/vmalloc.c +++ /dev/null @@ -1,385 +0,0 @@ -/* - * linux/mm/vmalloc.c - * - * Copyright (C) 1993 Linus Torvalds - * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999 - * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000 - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -rwlock_t vmlist_lock = RW_LOCK_UNLOCKED; -struct vm_struct * vmlist; - -static inline void free_area_pte(pmd_t * pmd, unsigned long address, unsigned long size) -{ - pte_t * pte; - unsigned long end; - - if (pmd_none(*pmd)) - return; - if (pmd_bad(*pmd)) { - pmd_ERROR(*pmd); - pmd_clear(pmd); - return; - } - pte = pte_offset(pmd, address); - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - pte_t page; - page = ptep_get_and_clear(pte); - address += PAGE_SIZE; - pte++; - if (pte_none(page)) - continue; - if (pte_present(page)) { - struct page *ptpage = pte_page(page); - if (VALID_PAGE(ptpage) && (!PageReserved(ptpage))) - __free_page(ptpage); - continue; - } - printk(KERN_CRIT "Whee.. 
Swapped out page in kernel page table\n"); - } while (address < end); -} - -static inline void free_area_pmd(pgd_t * dir, unsigned long address, unsigned long size) -{ - pmd_t * pmd; - unsigned long end; - - if (pgd_none(*dir)) - return; - if (pgd_bad(*dir)) { - pgd_ERROR(*dir); - pgd_clear(dir); - return; - } - pmd = pmd_offset(dir, address); - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - free_area_pte(pmd, address, end - address); - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); -} - -void vmfree_area_pages(unsigned long address, unsigned long size) -{ - pgd_t * dir; - unsigned long end = address + size; - - dir = pgd_offset_k(address); - flush_cache_all(); - do { - free_area_pmd(dir, address, end - address); - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - flush_tlb_all(); -} - -static inline int alloc_area_pte (pte_t * pte, unsigned long address, - unsigned long size, int gfp_mask, - pgprot_t prot, struct page ***pages) -{ - unsigned long end; - - address &= ~PMD_MASK; - end = address + size; - if (end > PMD_SIZE) - end = PMD_SIZE; - do { - struct page * page; - - if (!pages) { - spin_unlock(&init_mm.page_table_lock); - page = alloc_page(gfp_mask); - spin_lock(&init_mm.page_table_lock); - } else { - page = (**pages); - (*pages)++; - - /* Add a reference to the page so we can free later */ - if (page) - atomic_inc(&page->count); - - } - if (!pte_none(*pte)) - printk(KERN_ERR "alloc_area_pte: page already exists\n"); - if (!page) - return -ENOMEM; - set_pte(pte, mk_pte(page, prot)); - address += PAGE_SIZE; - pte++; - } while (address < end); - return 0; -} - -static inline int alloc_area_pmd(pmd_t * pmd, unsigned long address, - unsigned long size, int gfp_mask, - pgprot_t prot, struct page ***pages) -{ - unsigned long end; - - address &= ~PGDIR_MASK; - end = address + size; - if (end > PGDIR_SIZE) - end = PGDIR_SIZE; - do { - 
pte_t * pte = pte_alloc(&init_mm, pmd, address); - if (!pte) - return -ENOMEM; - if (alloc_area_pte(pte, address, end - address, - gfp_mask, prot, pages)) - return -ENOMEM; - address = (address + PMD_SIZE) & PMD_MASK; - pmd++; - } while (address < end); - return 0; -} - -/*static inline*/ int __vmalloc_area_pages (unsigned long address, - unsigned long size, - int gfp_mask, - pgprot_t prot, - struct page ***pages) -{ - pgd_t * dir; - unsigned long start = address; - unsigned long end = address + size; - - dir = pgd_offset_k(address); - spin_lock(&init_mm.page_table_lock); - do { - pmd_t *pmd; - - pmd = pmd_alloc(&init_mm, dir, address); - if (!pmd) - goto err; - - if (alloc_area_pmd(pmd, address, end - address, gfp_mask, prot, pages)) - goto err; // The kernel NEVER reclaims pmds, so no need to undo pmd_alloc() here - - address = (address + PGDIR_SIZE) & PGDIR_MASK; - dir++; - } while (address && (address < end)); - spin_unlock(&init_mm.page_table_lock); - flush_cache_all(); - XEN_flush_page_update_queue(); - return 0; -err: - spin_unlock(&init_mm.page_table_lock); - flush_cache_all(); - if (address > start) - vmfree_area_pages(start, address - start); - return -ENOMEM; -} - -int vmalloc_area_pages(unsigned long address, unsigned long size, - int gfp_mask, pgprot_t prot) -{ - return __vmalloc_area_pages(address, size, gfp_mask, prot, NULL); -} - -struct vm_struct * get_vm_area(unsigned long size, unsigned long flags) -{ - unsigned long addr, next; - struct vm_struct **p, *tmp, *area; - - area = (struct vm_struct *) kmalloc(sizeof(*area), GFP_KERNEL); - if (!area) - return NULL; - - size += PAGE_SIZE; - if (!size) { - kfree (area); - return NULL; - } - - addr = VMALLOC_START; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) ; p = &tmp->next) { - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long) tmp->addr) - break; - next = tmp->size + (unsigned long) tmp->addr; - if (next > addr) - addr = next; - if (addr > VMALLOC_END-size) - goto 
out; - } - area->flags = flags; - area->addr = (void *)addr; - area->size = size; - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - return NULL; -} - -void __vfree(void * addr, int free_area_pages) -{ - struct vm_struct **p, *tmp; - - if (!addr) - return; - if ((PAGE_SIZE-1) & (unsigned long) addr) { - printk(KERN_ERR "Trying to vfree() bad address (%p)\n", addr); - return; - } - write_lock(&vmlist_lock); - for (p = &vmlist ; (tmp = *p) ; p = &tmp->next) { - if (tmp->addr == addr) { - *p = tmp->next; - if (free_area_pages) - vmfree_area_pages(VMALLOC_VMADDR(tmp->addr), tmp->size); - write_unlock(&vmlist_lock); - kfree(tmp); - return; - } - } - write_unlock(&vmlist_lock); - printk(KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); -} - -void vfree(void * addr) -{ - __vfree(addr,1); -} - -void * __vmalloc (unsigned long size, int gfp_mask, pgprot_t prot) -{ - void * addr; - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) - return NULL; - area = get_vm_area(size, VM_ALLOC); - if (!area) - return NULL; - addr = area->addr; - if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, gfp_mask, - prot, NULL)) { - __vfree(addr, 0); - return NULL; - } - return addr; -} - -void * vmap(struct page **pages, int count, - unsigned long flags, pgprot_t prot) -{ - void * addr; - struct vm_struct *area; - unsigned long size = count << PAGE_SHIFT; - - if (!size || size > (max_mapnr << PAGE_SHIFT)) - return NULL; - area = get_vm_area(size, flags); - if (!area) { - return NULL; - } - addr = area->addr; - if (__vmalloc_area_pages(VMALLOC_VMADDR(addr), size, 0, - prot, &pages)) { - __vfree(addr, 0); - return NULL; - } - return addr; -} - -long vread(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < 
count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - *buf = '\0'; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} - -long vwrite(char *buf, char *addr, unsigned long count) -{ - struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; - - /* Don't allow overflow */ - if ((unsigned long) addr + count < count) - count = -(unsigned long) addr; - - read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { - vaddr = (char *) tmp->addr; - if (addr >= vaddr + tmp->size - PAGE_SIZE) - continue; - while (addr < vaddr) { - if (count == 0) - goto finished; - buf++; - addr++; - count--; - } - n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); - } -finished: - read_unlock(&vmlist_lock); - return buf - buf_start; -} diff --git a/linux-2.6.11-xen-sparse/arch/xen/Kconfig b/linux-2.6.11-xen-sparse/arch/xen/Kconfig index 2a8c5f200f..1c2ba9b4a2 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/Kconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/Kconfig @@ -114,10 +114,6 @@ config XEN_BLKDEV_TAP to a character device, allowing device prototyping in application space. Odds are that you want to say N here. 
-config XEN_WRITABLE_PAGETABLES - bool - default y - config XEN_SCRUB_PAGES bool "Scrub memory before freeing it to Xen" default y diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig index e906f98521..a781740c94 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xen0_defconfig @@ -19,7 +19,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set -CONFIG_XEN_WRITABLE_PAGETABLES=y CONFIG_XEN_SCRUB_PAGES=y CONFIG_X86=y # CONFIG_X86_64 is not set diff --git a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig index 95dee5b159..b1fc951a81 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig +++ b/linux-2.6.11-xen-sparse/arch/xen/configs/xenU_defconfig @@ -16,7 +16,6 @@ CONFIG_XEN_BLKDEV_FRONTEND=y CONFIG_XEN_NETDEV_FRONTEND=y # CONFIG_XEN_NETDEV_FRONTEND_PIPELINED_TRANSMITTER is not set # CONFIG_XEN_BLKDEV_TAP is not set -CONFIG_XEN_WRITABLE_PAGETABLES=y CONFIG_XEN_SCRUB_PAGES=y CONFIG_X86=y # CONFIG_X86_64 is not set diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c index b8829c8cdc..b7c29174fc 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/kernel/traps.c @@ -963,7 +963,7 @@ void __init trap_init(void) * and a callgate to lcall27 for Solaris/x86 binaries */ make_lowmem_page_readonly(&default_ldt[0]); - xen_flush_page_update_queue(); + flush_page_update_queue(); /* * Should be a barrier for any external CPU state. 
diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c index 7a0b091ca3..0cac0f30c3 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/fault.c @@ -553,7 +553,6 @@ vmalloc_fault: if (!pmd_present(*pmd_k)) goto no_context; set_pmd(pmd, *pmd_k); - xen_flush_page_update_queue(); /* flush PMD update */ pte_k = pte_offset_kernel(pmd_k, address); if (!pte_present(*pte_k)) diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c index 62427b2301..368179d560 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/hypervisor.c @@ -48,19 +48,12 @@ */ static spinlock_t update_lock = SPIN_LOCK_UNLOCKED; -/* Linux 2.6 isn't using the traditional batched interface. */ +#define QUEUE_SIZE 1 /*128*/ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) -#define QUEUE_SIZE 2048 #define pte_offset_kernel pte_offset -#define pmd_val_ma(v) (v).pmd; #define pud_t pgd_t #define pud_offset(d, va) d #else -#ifdef CONFIG_SMP -#define QUEUE_SIZE 1 -#else -#define QUEUE_SIZE 128 -#endif #define pmd_val_ma(v) (v).pud.pgd.pgd; #endif diff --git a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c index 6fe3f08632..2682ac5b90 100644 --- a/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c +++ b/linux-2.6.11-xen-sparse/arch/xen/i386/mm/pgtable.c @@ -195,7 +195,7 @@ pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) pte_t *pte = (pte_t *)__get_free_page(GFP_KERNEL|__GFP_REPEAT|__GFP_ZERO); if (pte) { make_page_readonly(pte); - xen_flush_page_update_queue(); + flush_page_update_queue(); } return pte; } diff --git a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c index f69db851a4..36c934fc5d 100644 --- 
a/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c +++ b/linux-2.6.11-xen-sparse/arch/xen/kernel/reboot.c @@ -109,10 +109,8 @@ static void __do_suspend(void) HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); -#ifdef CONFIG_XEN_WRITABLE_PAGETABLES HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_writable_pagetables); -#endif shutting_down = -1; diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h index 345b8264b8..1379b49694 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/page.h @@ -111,7 +111,7 @@ typedef struct { unsigned long pgprot; } pgprot_t; static inline unsigned long pgd_val(pgd_t x) { unsigned long ret = x.pgd; - if (ret) ret = machine_to_phys(ret); + if (ret) ret = machine_to_phys(ret) | 1; return ret; } #define pgprot_val(x) ((x).pgprot) diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h index d932c6c17f..dfc5b1e155 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h +++ b/linux-2.6.11-xen-sparse/include/asm-xen/asm-i386/pgtable.h @@ -407,7 +407,6 @@ extern void noexec_setup(const char *str); do { \ if (__dirty) { \ if ( likely((__vma)->vm_mm == current->mm) ) { \ - xen_flush_page_update_queue(); \ HYPERVISOR_update_va_mapping((__address), (__entry), UVMF_INVLPG); \ } else { \ xen_l1_entry_update((__ptep), (__entry).pte_low); \ @@ -426,7 +425,6 @@ do { \ #define ptep_establish_new(__vma, __address, __ptep, __entry) \ do { \ if (likely((__vma)->vm_mm == current->mm)) { \ - xen_flush_page_update_queue(); \ HYPERVISOR_update_va_mapping((__address), \ __entry, 0); \ } else { \ diff --git a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h index 4d77312f6e..568e84bc2f 100644 --- a/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h 
+++ b/linux-2.6.11-xen-sparse/include/asm-xen/hypervisor.h @@ -117,8 +117,6 @@ void _flush_page_update_queue(void); if (per_cpu(mmu_update_queue_idx, smp_processor_id())) \ _flush_page_update_queue(); \ } while (0) -#define xen_flush_page_update_queue() (_flush_page_update_queue()) -#define XEN_flush_page_update_queue() (_flush_page_update_queue()) void MULTICALL_flush_page_update_queue(void); #ifdef CONFIG_XEN_PHYSDEV_ACCESS -- 2.30.2